Read in data
> european <- read_csv("01-cleaning_data_data/european_recoded.csv")
> australian <- read_csv("01-cleaning_data_data/australian_recoded.csv")
>
> dim(european)
## [1] 209 115
> dim(australian)
## [1] 269 146
> # Tag each survey with an indicator column so the origin of every row can be
> # recovered after the two datasets are combined.
> european$EU <- 1
> australian$AU <- 1
>
> # merge() without `by` joins on every column name the two data frames share;
> # all = TRUE keeps the rows of both sides that find no match, with NA in the
> # columns that exist on only one side.
> all <- merge(european,australian,all = TRUE)
>
> table(all$EU,all$AU,useNA = "always")
##
## 1 <NA>
## 1 0 209
## <NA> 269 0
Variables to describe dataset (can be different between contexts)
> demographics_var <- c("Age","Gender","L1","speak.other.L2","study.other.L2","origins","year.studyL2","other5.other.ways","degree","roleL2.degree","study.year","prof","L2.VCE","uni1.year","Context")
> l2School <- "\\.L2school$"
> l2School_variables <- colnames(all)[grep(l2School,colnames(all))]
> #table(all$L1,all$Context) # too many levels - needs to be cleaned (ex tot number of languages?)
> table(all$L1,useNA = "always")
##
## Afrikaans Albanian Burmese
## 1 2 1
## Cantonese Chinese Croatian
## 4 7 1
## Dutch English English and Dutch
## 1 201 2
## German German and English German and Turkish
## 77 2 1
## I Indonesian Italian
## 1 1 89
## Japanese Mandarin Persian
## 1 5 1
## Persian (Farsi) Romanian Russian
## 1 3 2
## Sindhi Slovak Spanish
## 1 1 3
## Turkish Ukrainian <NA>
## 1 1 67
> ggplot(all,aes(x=L1,fill=Context)) + geom_bar() + coord_flip() + ggtitle("First Language") + labs(y="N. of participants",x="")+theme_bw()

> #table(all$speak.other.L2,all$Context)
> L2 <- data.frame(Freq=table(all$speak.other.L2)[order(table(all$speak.other.L2),decreasing = TRUE)],
+ L2=names(table(all$speak.other.L2))[order(table(all$speak.other.L2),decreasing = TRUE)]) # too many levels - needs to be cleaned (ex tot number of languages?)
> head(L2)
## Freq.Var1 Freq.Freq L2
## 1 No 171 No
## 2 Yes 143 Yes
## 3 French 13 French
## 4 English 9 English
## 5 Italian 7 Italian
## 6 Japanese 4 Japanese
> table(all$origins,useNA = "always")
##
## No Yes <NA>
## 323 89 66
> table(all$year.studyL2)
##
## 0 years 1- 3 years
## 69 12
## 1-3 years 4-6 years
## 11 61
## First year of primary school Kindergarten
## 80 30
## Less than a year more than 6 years
## 27 49
## Other
## 72
> table(all$degree)
##
## BA in Anglistik BA in Nordamerikastudien
## 43 4
## HUM HUM.SCI
## 129 6
## LA Lingue e letterature straniere
## 36 82
## Lingue, mercati e culture dell'Asia QC
## 13 5
## SCI
## 86
- study.year in the European context is uni1.year in the Australian context
> all$study.year[is.na(all$study.year)] <- all$uni1.year[is.na(all$study.year)]
> #table(all$study.year)
> all$study.year <- ifelse(all$study.year == "Already graduated after 5 semesters in March 2016, was interested in survery/study, sorry.","6th semester",all$study.year)
> table(all$study.year)
##
## 1st semester 1st year 2nd semester
## 72 255 5
## 2nd year 3rd semester 3rd year
## 37 5 20
## 3rd year of Master 4th year bachelor 5th semester
## 1 8 2
## 6th semester 7th year Master
## 3 1 3
> table(all$prof,useNA = "always")
##
## Advanced Elementary Intermediate
## 78 105 84
## Upper-intermediate <NA>
## 146 65
Filter participants : keep only the ones that meet the inclusion criteria
> # Back-fill study.year from uni1.year where it is missing (a no-op when the
> # column has already been filled).
> all$study.year[is.na(all$study.year)] <- all$uni1.year[is.na(all$study.year)]
> # Keep only the participants we want to include in the study: those whose
> # study.year is "1st year" or "1st semester". The labels are spelled out
> # rather than taken from names(table(all$study.year))[1], whose value depends
> # on which levels happen to be present and how they sort.
> # %in% is FALSE for NA, so rows without a study.year are dropped.
>
> filtered <- subset(all, study.year %in% c("1st semester", "1st year"))
> #& year.studyL2 != "0 years"
Define demographic variables
Need to check datasets with Richi (why do we have QC in L1? Am I not using the latest dataset?)
> all <- filtered
> demographics_var <- c("Age","Gender","L1","speak.other.L2","study.other.L2","origins","year.studyL2","other5.other.ways","degree","roleL2.degree","study.year","prof","L2.VCE","uni1.year","Context")
> l2School <- "\\.L2school$"
> l2School_variables <- colnames(all)[grep(l2School,colnames(all))]
>
> ggplot(all,aes(x=L1,fill=Context)) + geom_bar() + coord_flip() + ggtitle("First Language") + labs(y="N. of participants",x="") + theme_bw()

> table(all$L1,all$Context)
##
## English in Germany English in Italy
## Afrikaans 0 0
## Albanian 0 1
## Cantonese 0 0
## Chinese 0 2
## Dutch 1 0
## English 1 0
## English and Dutch 0 0
## German 64 0
## German and English 1 0
## I 0 0
## Indonesian 0 0
## Italian 0 87
## Japanese 0 0
## Mandarin 0 0
## Persian (Farsi) 0 0
## Romanian 0 0
## Russian 2 0
## Sindhi 0 0
## Spanish 1 0
## Turkish 1 0
## Ukrainian 0 1
##
## German in Australia Italian in Australia
## Afrikaans 1 0
## Albanian 0 0
## Cantonese 2 0
## Chinese 2 0
## Dutch 0 0
## English 75 73
## English and Dutch 2 0
## German 0 0
## German and English 1 0
## I 0 1
## Indonesian 1 0
## Italian 0 0
## Japanese 1 0
## Mandarin 1 1
## Persian (Farsi) 1 0
## Romanian 1 0
## Russian 0 0
## Sindhi 1 0
## Spanish 0 0
## Turkish 0 0
## Ukrainian 0 0
> table(all$degree,all$L1)
##
## Afrikaans Albanian Cantonese Chinese
## BA in Anglistik 0 0 0 0
## BA in Nordamerikastudien 0 0 0 0
## HUM 1 0 2 0
## HUM.SCI 0 0 0 0
## LA 0 0 0 0
## Lingue e letterature straniere 0 1 0 1
## Lingue, mercati e culture dell'Asia 0 0 0 1
## SCI 0 0 0 2
##
## Dutch English English and Dutch
## BA in Anglistik 0 1 0
## BA in Nordamerikastudien 0 0 0
## HUM 0 93 1
## HUM.SCI 0 6 0
## LA 1 0 0
## Lingue e letterature straniere 0 0 0
## Lingue, mercati e culture dell'Asia 0 0 0
## SCI 0 47 1
##
## German German and English I
## BA in Anglistik 34 1 0
## BA in Nordamerikastudien 4 0 0
## HUM 0 0 1
## HUM.SCI 0 0 0
## LA 25 0 0
## Lingue e letterature straniere 0 0 0
## Lingue, mercati e culture dell'Asia 0 0 0
## SCI 0 1 0
##
## Indonesian Italian Japanese Mandarin
## BA in Anglistik 0 0 0 0
## BA in Nordamerikastudien 0 0 0 0
## HUM 0 0 0 0
## HUM.SCI 0 0 0 0
## LA 0 0 0 0
## Lingue e letterature straniere 0 75 0 0
## Lingue, mercati e culture dell'Asia 0 12 0 0
## SCI 1 0 1 2
##
## Persian (Farsi) Romanian Russian
## BA in Anglistik 0 0 2
## BA in Nordamerikastudien 0 0 0
## HUM 0 0 0
## HUM.SCI 0 0 0
## LA 0 0 0
## Lingue e letterature straniere 0 0 0
## Lingue, mercati e culture dell'Asia 0 0 0
## SCI 1 1 0
##
## Sindhi Spanish Turkish Ukrainian
## BA in Anglistik 0 1 0 0
## BA in Nordamerikastudien 0 0 0 0
## HUM 0 0 0 0
## HUM.SCI 0 0 0 0
## LA 0 0 1 0
## Lingue e letterature straniere 0 0 0 1
## Lingue, mercati e culture dell'Asia 0 0 0 0
## SCI 1 0 0 0
- Check for L1 but we decided not to filter for it
> #Filter by L1
>
> nc <- names(table(all$Context))
> table(all$Context)
##
## English in Germany English in Italy German in Australia
## 72 91 89
## Italian in Australia
## 75
> # Candidate L1 filter: for each context in nc, keep the participants whose L1
> # is in the list given for that context.
> # %in% is used instead of == because == returns NA when L1 (or Context) is
> # missing; an NA in the row index would add an all-NA row to the result.
> l1_filter <- all[(all$Context %in% nc[1] & all$L1 %in% c("German","German and English")) |
+ (all$Context %in% nc[2] & all$L1 %in% "Italian") |
+ (all$Context %in% nc[3] & all$L1 %in% c("English","English and Dutch","German and English")) |
+ (all$Context %in% nc[4] & all$L1 %in% c("English","English and Dutch","German and English")),]
>
>
> #all <- l1_filter
>
> # do not filter for L1
> all <- all
>
> # subset demographics
> demo <- subset(all,select=c("Resp.ID",demographics_var,l2School_variables))
>
> # Numeri finali
> table(l1_filter$Context)
##
## English in Germany English in Italy German in Australia
## 65 87 78
## Italian in Australia
## 73
> table(all$Context)
##
## English in Germany English in Italy German in Australia
## 72 91 89
## Italian in Australia
## 75
- Filter missing value:
- Filter participants who didn’t put the degree
- we don’t care about speak.other.L2 and study.other.L2
> missing_bySample <- rowSums(is.na(demo))
> names(missing_bySample) <- demo$Resp.ID
> missing_byVar <- colSums(is.na(demo))
> names(missing_byVar) <- colnames(demo)
>
> barplot(missing_bySample)

> d <- data.frame(miss=missing_byVar)
> d$varID <- rownames(d)
> ggplot(data=d,aes(x=varID,y=miss)) + geom_bar(stat="identity") + theme_bw() +theme(axis.text.x = element_text(angle = 45, hjust = 1))

> demo_missing <- demo %>% group_by(Context) %>% summarise(roleL2.degree_na = sum(is.na(roleL2.degree)),
+ L2.VCE_na = sum(is.na(L2.VCE)),
+ other5.other.ways_na=sum(is.na(other5.other.ways )),
+ uni1.year_na = sum(is.na(uni1.year)),
+ primary1.L2school_na=sum(is.na(primary1.L2school)),
+ CLS3.L2school_na = sum(is.na(CLS3.L2school)),
+ VSL4.L2school_na=sum(is.na(VSL4.L2school)),
+ degree = sum(is.na(degree)),
+ schooL2country5.L2school_na=sum(is.na(schooL2country5.L2school)))
>
> # We do not filter for speak.other.L2 or study.other.L2
>
> #demo[is.na(demo$speak.other.L2),]
> # teniamo
> #demo[is.na(demo$study.other.L2),]
> missing_bySample[names(missing_bySample) == "5166861581"]
## 5166861581
## 10
> #demo[is.na(demo$year.studyL2),]
> missing_bySample[names(missing_bySample) == "5378798787"]
## 5378798787
## 3
> # remove NA from degree
> #table(demo$degree,useNA = "always")
> # Remove people
> all <- all[!is.na(all$degree),]
Stats about filtered dataset
> kable(table(all$Context))
| English in Germany |
70 |
| English in Italy |
91 |
| German in Australia |
88 |
| Italian in Australia |
74 |
> kable(table(all$study.year))
| 1st semester |
70 |
| 1st year |
253 |
> kable(table(all$year.studyL2))
| 0 years |
33 |
| 1- 3 years |
9 |
| 1-3 years |
7 |
| 4-6 years |
53 |
| First year of primary school |
73 |
| Kindergarten |
29 |
| Less than a year |
18 |
| more than 6 years |
41 |
| Other |
59 |
Recoded demographic variables
> recoded_dem_richi <- read_excel("02-descriptive_data/21 03 merged_filtered_imputedMedian_likertNumber.xlsx")
Write filtered and merged dataset
> write.csv(all,file.path("02-descriptive_data/context-merged_filtered.csv"))
Descriptive plots and tables
- Summary demographics TO DO: - to change yearL2.study.richi
> # add numbers on the bar
>
> # tabAge <- t(table(all$Age,all$Context))
> # ggplot(all,aes(x=Age,fill=Context)) + geom_bar(position="dodge",colour="white") + labs(y="N participants") + scale_y_continuous(breaks=seq(0,90,10),limits=c(0,90)) + theme_bw() + draw_grob(tableGrob(tabAge), x=2.5, y=40, width=0.3, height=0.4) + ggtitle("Participants by age")
> # tabAge
>
> # Participant counts with one row per Context and one column per Age group.
> tabAge <- t(table(all$Age,all$Context))
> # Flatten the table (column by column) into long format for ggplot, dropping
> # empty cells. The repeat counts come from the table's own dimensions, so the
> # labels stay aligned with the counts whatever the number of contexts or age
> # groups is.
> ggdf <- data.frame(Age = rep(colnames(tabAge),each=nrow(tabAge))[as.numeric(tabAge) != 0],
+ N.Participants = as.numeric(tabAge)[as.numeric(tabAge) != 0],
+ Context = rep(rownames(tabAge),times=ncol(tabAge))[as.numeric(tabAge) != 0])
>
> ggplot(ggdf,aes(x=Age,y=N.Participants,fill=Context)) + geom_bar(position="dodge",colour="white",stat="identity") + scale_y_continuous(breaks=seq(0,90,10),limits=c(0,90)) + theme_bw() + ggtitle("Participants by age")+
+ geom_text(aes(label = N.Participants), hjust=0.5, vjust=-0.25, size = 2.5,position=position_dodge(width=0.9))

> # add numbers on the bar
>
> # Participant counts with one row per Context and one column per Gender
> # (the variable name tabAge is reused from the age plot).
> tabAge <- t(table(all$Gender,all$Context))
> # Flatten the table (column by column) into long format for ggplot, dropping
> # empty cells. The repeat counts come from the table's own dimensions, so the
> # labels stay aligned with the counts whatever the number of contexts or
> # gender levels is.
> ggdf <- data.frame(Gender = rep(colnames(tabAge),each=nrow(tabAge))[as.numeric(tabAge) != 0],
+ N.Participants = as.numeric(tabAge)[as.numeric(tabAge) != 0],
+ Context = rep(rownames(tabAge),times=ncol(tabAge))[as.numeric(tabAge) != 0])
>
>
> ggplot(ggdf,aes(x=Gender,y=N.Participants,fill=Context)) + geom_bar(position="dodge",colour="white",stat="identity") + labs(y="N participants") + scale_y_continuous(breaks=seq(0,90,10),limits=c(0,90)) + theme_bw() + ggtitle("Participants by gender")+ geom_text(aes(label = N.Participants), hjust=0.5, vjust=-0.25, size = 2.5,position=position_dodge(width=0.9))

> # add numbers on the bar
> tabAge <- t(table(all$origins,all$Context))
> ggplot(all,aes(x=origins,fill=Context)) + geom_bar(position="dodge",colour="white") + ggtitle("Origins by context") + scale_y_continuous(breaks=seq(0,90,10),limits=c(0,90)) + theme_bw() + draw_grob(tableGrob(tabAge), x=2, y=60, width=0.3, height=0.4) + ggtitle("Participants by origins")

> tabAge
##
## No Yes
## English in Germany 65 5
## English in Italy 90 1
## German in Australia 63 25
## Italian in Australia 36 38
> tabAge <- t(table(all$prof,all$Context))
> ggplot(all,aes(x=Context,fill=prof)) + geom_bar(position="dodge",colour="white") + ggtitle("Proficiency by context") + scale_y_continuous(breaks=seq(0,90,10),limits=c(0,90)) + theme_bw() + draw_grob(tableGrob(tabAge), x=2, y=80, width=0.3, height=0.4)

> tabAge
##
## Advanced Elementary Intermediate Upper-intermediate
## English in Germany 38 0 5 27
## English in Italy 23 2 9 57
## German in Australia 4 32 25 27
## Italian in Australia 0 29 29 16
> tabAge <- t(table(all[all$Context != "English in Germany" & all$Context != "English in Italy","L2.VCE"],all[all$Context != "English in Germany" & all$Context != "English in Italy",'Context'],useNA = "always"))
> tabAge <- tabAge[-3,]
>
> ggplot(all[all$Context != "English in Germany" & all$Context != "English in Italy",],aes(x=Context,fill=L2.VCE)) + geom_bar(position="dodge",colour="white") + ggtitle("L2.VCE by context") + scale_y_continuous(breaks=seq(0,90,10),limits=c(0,90)) + theme_bw() + draw_grob(tableGrob(tabAge), x=2, y=80, width=0.3, height=0.4)

- To be sorted out with Richi
> # year study L2
> table(all$year.studyL2,all$other.year.studyL2.richi)
##
## BILINGUAL FIRST.YEAR.SECONDARY
## 0 years 0 0
## 1- 3 years 0 0
## 1-3 years 0 0
## 4-6 years 0 0
## First year of primary school 0 0
## Kindergarten 0 0
## Less than a year 0 0
## more than 6 years 0 0
## Other 4 10
##
## FOURTH.YEAR.PRIMARY LOWER.SECONDARY
## 0 years 0 0
## 1- 3 years 0 0
## 1-3 years 0 0
## 4-6 years 0 0
## First year of primary school 0 0
## Kindergarten 0 0
## Less than a year 0 0
## more than 6 years 0 0
## Other 5 4
##
## PERSONAL SECOND.YEAR.PRIMARY
## 0 years 0 0
## 1- 3 years 0 0
## 1-3 years 0 0
## 4-6 years 0 0
## First year of primary school 0 0
## Kindergarten 0 0
## Less than a year 0 0
## more than 6 years 0 0
## Other 2 2
##
## SECOND.YEAR.SECONDARY THIRD.YEAR.PRIMARY
## 0 years 0 0
## 1- 3 years 0 0
## 1-3 years 0 0
## 4-6 years 0 0
## First year of primary school 0 0
## Kindergarten 0 0
## Less than a year 0 0
## more than 6 years 0 0
## Other 2 28
> all$year.studyL2 <- ifelse(all$year.studyL2 == "Other",all$other.year.studyL2.richi,all$year.studyL2 )
>
> # European context
> ggplot(all[all$Context == "English in Germany" | all$Context == "English in Italy",],aes(x=degree,fill=year.studyL2)) + geom_bar(position="dodge",colour="white") + theme_bw() + ggtitle("Degree by study year L2, by Context") + facet_grid(~Context,scales="free") + theme(axis.text.x = element_text(angle = 45, hjust = 1)) + labs(y = "N participants", x = "degree")

> # Australian context
> tabAge <- t(table(all[all$Context == "Italian in Australia" | all$Context == "German in Australia",'degree'],all[all$Context == "Italian in Australia" | all$Context == "German in Australia",'Context']))
> ggplot(all[all$Context == "Italian in Australia" | all$Context == "German in Australia",],aes(x=Context,fill=degree)) + geom_bar(position="dodge",colour="white") + theme_bw() + ggtitle("Degree in Australian Contexts") + draw_grob(tableGrob(tabAge), x=1., y=40, width=0.3, height=0.4)

> tabAge
##
## HUM HUM.SCI SCI
## German in Australia 48 4 36
## Italian in Australia 50 2 22
> # European context
> # Degree counts for the two European contexts (rows = Context, columns =
> # degree). The table is only printed afterwards; it is not drawn onto the plot.
> tabAge <- t(table(all[all$Context == "English in Italy" | all$Context == "English in Germany",'degree'],all[all$Context == "English in Italy" | all$Context == "English in Germany",'Context']))
>
> ggplot(all[all$Context == "English in Italy" | all$Context == "English in Germany",],aes(x=Context,fill=degree)) + geom_bar(position="dodge",colour="white") + theme_bw() + ggtitle("Degree in European Contexts")

> tabAge
##
## BA in Anglistik BA in Nordamerikastudien LA
## English in Germany 39 4 27
## English in Italy 0 0 0
##
## Lingue e letterature straniere
## English in Germany 0
## English in Italy 78
##
## Lingue, mercati e culture dell'Asia
## English in Germany 0
## English in Italy 13
Australian context-specific variables
> kable(table(all$reconnect.comm,all$Context))
| Agree |
0 |
0 |
8 |
11 |
| Disagree |
0 |
0 |
35 |
14 |
| Not sure |
0 |
0 |
3 |
4 |
| Strongly agree |
0 |
0 |
12 |
28 |
| Strongly disagree |
0 |
0 |
30 |
17 |
> kable(table(all$speakersmelb.comm,all$Context))
| Agree |
0 |
0 |
44 |
41 |
| Disagree |
0 |
0 |
6 |
2 |
| Not sure |
0 |
0 |
25 |
12 |
| Strongly agree |
0 |
0 |
12 |
19 |
| Strongly disagree |
0 |
0 |
1 |
0 |
> kable(table(all$comecloser.comm,all$Context))
| Agree |
0 |
0 |
21 |
34 |
| Disagree |
0 |
0 |
16 |
6 |
| Not sure |
0 |
0 |
43 |
17 |
| Strongly agree |
0 |
0 |
6 |
17 |
| Strongly disagree |
0 |
0 |
2 |
0 |
Likert scales
- Convert Likert scales to numbers
> # Convert a vector of Likert labels to numeric codes, from
> # "Strongly disagree" = 1 up to "Strongly agree" = 5. Any value that is not
> # one of the five labels (including NA) becomes NA.
> convertToNumber <- function(column){
+ likert_levels <- c("Strongly disagree","Disagree","Not sure","Agree","Strongly agree")
+ as.numeric(factor(column,levels = likert_levels))
+ }
>
> table(all$Context)
##
## English in Germany English in Italy German in Australia
## 70 91 88
## Italian in Australia
## 74
> table(all$study.year)
##
## 1st semester 1st year
## 70 253
> # Run convertToNumber over every column named in likert_variables_all (defined
> # outside this section). apply() works column by column (MARGIN = 2) on the
> # selected columns, and the result is turned back into a data frame.
> convert_likert <- data.frame(apply(subset(all,select=likert_variables_all),2,convertToNumber))
> # Suffix the numeric versions with "1" so they can sit next to the original
> # text columns without a name clash.
> colnames(convert_likert) <- paste0(colnames(convert_likert),"1")
>
> # Names of the numeric Likert columns.
> likert_variables1 <- paste0(likert_variables_all,"1")
>
> # join the converted variables to the filtered dataset
> filtered_conv <- cbind(all,convert_likert)
>
> table(filtered_conv[,likert_variables_all[4]],filtered_conv[,likert_variables1[4]],useNA = "always")
##
## 1 2 3 4 5 <NA>
## Agree 0 0 0 121 0 0
## Disagree 0 10 0 0 0 0
## Not sure 0 0 39 0 0 0
## Strongly agree 0 0 0 0 152 0
## Strongly disagree 1 0 0 0 0 0
## <NA> 0 0 0 0 0 0
> write.csv(filtered_conv,"02-descriptive_data/merged_filtered_likertNumber.csv",row.names = FALSE)
Impute missing values - Using median values
The missing values appear to be missing at random, and there are at most two missing values in any one variable (see plots below). In order not to lose 12 participants while doing the factor analysis across contexts, it is preferable to impute the 12 missing values.
> all <- filtered_conv
>
> # Items to use for factor analysis : items shared between contexts
> # items to be used for the FA
>
> usable_items <- likert_variables1[!(likert_variables1 %in% c("necessity1","educated1","reconnect.comm1", "speakersmelb.comm1", "comecloser.comm1"))]
> rownames(all) <- all$Resp.ID
> usable_data_context <- all[,c(usable_items,"Context")]
> dat_noNA <- usable_data_context[rowSums(is.na(usable_data_context)) == 0,]
> all_noNA <- all[rowSums(is.na(usable_data_context)) == 0,]
> table(rowSums(is.na(usable_data_context)))
##
## 0 1
## 311 12
> # Participants with NA to remove
> table(rowSums(is.na(usable_data_context)),usable_data_context$Context)
##
## English in Germany English in Italy German in Australia
## 0 70 85 87
## 1 0 6 1
##
## Italian in Australia
## 0 69
## 1 5
> # check what to use to impute
> # have a look at the distribution of missing values
> library(mice)
> library(VIM)
>
> mice_plot <- aggr(usable_data_context[,usable_items], col=c('navyblue','yellow'),
+ numbers=TRUE, sortVars=TRUE,
+ labels=names(usable_data_context[,usable_items]), cex.axis=.4,
+ gap=1, ylab=c("Missing data","Pattern"),cex.numbers=0.5)

##
## Variables sorted by number of missings:
## Variable Count
## time.integr1 0.006191950
## expect.ought1 0.003095975
## life.intr1 0.003095975
## job.instru1 0.003095975
## knowledge.instru1 0.003095975
## career.instru1 0.003095975
## money.instru1 0.003095975
## meeting.integr1 0.003095975
## citizen.post1 0.003095975
## interact.post1 0.003095975
## globalaccess.post1 0.003095975
## converse.id1 0.000000000
## dream.id1 0.000000000
## usewell.id1 0.000000000
## whenever.id1 0.000000000
## consider.ought1 0.000000000
## people.ought1 0.000000000
## fail.ought1 0.000000000
## enjoy.intr1 0.000000000
## exciting.intr1 0.000000000
## challenge.intr1 0.000000000
## becomelike.integr1 0.000000000
## affinity.integr1 0.000000000
## improve.prof1 0.000000000
## speaking.prof1 0.000000000
## reading.prof1 0.000000000
## written.prof1 0.000000000
## listening.prof1 0.000000000
## overseas.post1 0.000000000
> # Imputing using median
> library(Hmisc)
> imputedMedian <- usable_data_context
>
> # Replace the missing answers of each item listed below with that item's
> # median. One loop instead of one copy-pasted statement per item, so the
> # imputation rule is written once. The loop variable `item` is left behind in
> # the workspace.
> for (item in c("globalaccess.post1", "citizen.post1", "money.instru1",
+ "knowledge.instru1", "life.intr1", "time.integr1",
+ "expect.ought1", "job.instru1", "career.instru1",
+ "meeting.integr1", "interact.post1")) {
+ imputedMedian[[item]] <- impute(imputedMedian[[item]], median)
+ }
>
>
> # check before after
> table(imputedMedian$time.integr1)
##
## 2 3 4 5
## 3 21 95 204
> table(usable_data_context$time.integr1)
##
## 2 3 4 5
## 3 21 95 202
> table(imputedMedian$life.intr1)
##
## 1 2 3 4 5
## 8 79 81 117 38
> table(usable_data_context$life.intr1)
##
## 1 2 3 4 5
## 8 79 80 117 38
> table(imputedMedian$knowledge.instru1)
##
## 1 2 3 4 5
## 1 2 29 189 102
> table(usable_data_context$knowledge.instru1)
##
## 1 2 3 4 5
## 1 2 29 188 102
> table(imputedMedian$money.instru1)
##
## 1 2 3 4 5
## 3 38 179 84 19
> table(usable_data_context$money.instru1)
##
## 1 2 3 4 5
## 3 38 178 84 19
> table(imputedMedian$citizen.post1)
##
## 1 2 3 4 5
## 3 22 75 148 75
> table(usable_data_context$citizen.post1)
##
## 1 2 3 4 5
## 3 22 75 147 75
> table(imputedMedian$globalaccess.post1)
##
## 1 2 3 4 5
## 1 3 20 159 140
> table(usable_data_context$globalaccess.post1)
##
## 1 2 3 4 5
## 1 3 20 158 140
> table(imputedMedian$expect.ought1)
##
## 1 2 3 4 5
## 126 142 30 21 4
> table(usable_data_context$expect.ought1)
##
## 1 2 3 4 5
## 126 141 30 21 4
> table(imputedMedian$job.instru1)
##
## 2 3 4 5
## 13 103 133 74
> table(usable_data_context$job.instru1)
##
## 2 3 4 5
## 13 103 132 74
> table(imputedMedian$career.instru1)
##
## 1 2 3 4 5
## 1 1 63 131 127
> table(usable_data_context$career.instru1)
##
## 1 2 3 4 5
## 1 1 63 130 127
> table(imputedMedian$meeting.integr1)
##
## 2 3 4 5
## 1 10 121 191
> table(usable_data_context$meeting.integr1)
##
## 2 3 4 5
## 1 10 121 190
> table(imputedMedian$interact.post1)
##
## 2 3 4 5
## 1 19 140 163
> table(usable_data_context$interact.post1)
##
## 2 3 4 5
## 1 19 140 162
- Substitute imputed data for the common variables to be used in the Factor Analysis
> all <- all[,!(colnames(all) %in% usable_items)]
> imputedMedian$Context <- NULL
> sum(!(colnames(imputedMedian) %in% usable_items))
## [1] 0
> # Re-attach the imputed items, aligning the rows of imputedMedian to the rows
> # of `all` by respondent id (imputedMedian's row names are the Resp.ID values).
> # match(x, table) returns, for each element of x, its position in table, so
> # the ids of `all` must be looked up in imputedMedian's row names; the reverse
> # order is only correct while both objects are in the same row order.
> all <- cbind(all,imputedMedian[match(all$Resp.ID,rownames(imputedMedian)),])
Save imputed data
> write.csv(all,"02-descriptive_data/merged_filtered_imputedMedian_likertNumber.csv",row.names = FALSE)
Barplot of likert variables
> all_melt <- melt(all,id.vars = c("Resp.ID","Gender","Age","prof","Context","study.year"),
+ measure.vars = likert_variables1)
## Warning: attributes are not identical across measure variables; they will
## be dropped
> all_melt$value <- factor(all_melt$value,levels=c(1,2,3,4,5),labels=c("Strongly disagree","Disagree","Not sure","Agree","Strongly agree"))
> # dim(all_melt)
> # 323*length(likert_variables1)
>
> all_melt <- all_melt %>% separate(variable,into=c("item","type"),sep="\\.",remove=FALSE)
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 646 rows
## [9368, 9369, 9370, 9371, 9372, 9373, 9374, 9375, 9376, 9377, 9378, 9379,
## 9380, 9381, 9382, 9383, 9384, 9385, 9386, 9387, ...].
> ggplot(all_melt,aes(x=variable,fill=value)) + geom_bar(position = "stack",colour="black") +
+ facet_grid(Context~type,scales = "free")+theme(axis.text.x = element_text(angle = 45, hjust = 1),axis.text=element_text(size=8)) + ggtitle("Filtered dataset") + scale_fill_manual(values=c("#ca0020","#f4a582","#ffffbf","#abd9e9","#2c7bb6","grey"))

> filt_sum <- all_melt %>% group_by(Context,variable,type,value) %>% dplyr::summarise(Ngroup=length(value))
> ggplot(filt_sum,aes(x=value,y=Ngroup,colour=Context,group=interaction(variable, Context))) + geom_line() + geom_point() + facet_wrap(~type,scales = "free")+theme(axis.text.x = element_text(angle = 45, hjust = 1))

Barplot of Educated and Necessity in the Australian and European Contexts
> # add numbers on the bar
> educated <- all[all$Context %in% c("German in Australia","Italian in Australia"),]
> table(educated$educated1,educated$Context,useNA="always")
##
## German in Australia Italian in Australia <NA>
## 1 11 9 0
## 2 25 24 0
## 3 12 13 0
## 4 29 18 0
## 5 11 10 0
## <NA> 0 0 0
> educated$educated1 <- factor(educated$educated1,levels = c(1,2,3,4,5),labels=c("Strongly disagree","Disagree","Not sure","Agree","Strongly agree"))
>
> tabEdu <- t(table(educated$educated1,educated$Context))
> ggdf <- data.frame(Educated = rep(colnames(tabEdu),each=2),
+ N.Participants = as.numeric(tabEdu),
+ Context = rep(rownames(tabEdu),times=5))
>
>
> ggplot(ggdf,aes(x=Educated,y=N.Participants,fill=Context)) + geom_bar(position="dodge",colour="white",stat="identity") + labs(y="N participants") + scale_y_continuous(breaks=seq(0,35,10),limits=c(0,35)) + theme_bw() + ggtitle("Educated by Context")+ geom_text(aes(label = N.Participants), hjust=0.5, vjust=-0.25, size = 2.5,position=position_dodge(width=0.9))

> ggplot(ggdf,aes(x=Context,y=N.Participants,fill=Educated)) + geom_bar(position="dodge",colour="white",stat="identity") + labs(y="N participants") + scale_y_continuous(breaks=seq(0,35,10),limits=c(0,35)) + theme_bw() + ggtitle("Educated by Context")+ geom_text(aes(label = N.Participants), hjust=0.5, vjust=-0.25, size = 2.5,position=position_dodge(width=0.9))

> # add numbers on the bar
> necessity <- all[all$Context %in% c("English in Germany","English in Italy"),]
> table(necessity$necessity1,necessity$Context,useNA="always")
##
## English in Germany English in Italy <NA>
## 1 1 12 0
## 2 7 16 0
## 3 13 6 0
## 4 32 36 0
## 5 16 20 0
## <NA> 1 1 0
> necessity$necessity1 <- factor(necessity$necessity1,levels = c(1,2,3,4,5),labels=c("Strongly disagree","Disagree","Not sure","Agree","Strongly agree"))
>
> tabNec <- t(table(necessity$necessity1,necessity$Context,useNA = "always"))[-3,]
> ggdf <- data.frame(Necessity = rep(colnames(tabNec),each=2),
+ N.Participants = as.numeric(tabNec),
+ Context = rep(rownames(tabNec),times=6))
>
>
> ggplot(ggdf,aes(x=Necessity,y=N.Participants,fill=Context)) + geom_bar(position="dodge",colour="white",stat="identity") + labs(y="N participants") + scale_y_continuous(breaks=seq(0,40,10),limits=c(0,40)) + theme_bw() + ggtitle("Necessity by Context")+ geom_text(aes(label = N.Participants), hjust=0.5, vjust=-0.25, size = 2.5,position=position_dodge(width=0.9))

> # The y axis runs to 40, as in the previous Necessity plot: with limits of c(0,35) the bar of 36 participants (and its label) falls outside the scale and is removed.
> ggplot(ggdf,aes(x=Context,y=N.Participants,fill=Necessity)) + geom_bar(position="dodge",colour="white",stat="identity") + labs(y="N participants") + scale_y_continuous(breaks=seq(0,40,10),limits=c(0,40)) + theme_bw() + ggtitle("Necessity by Context")+ geom_text(aes(label = N.Participants), hjust=0.5, vjust=-0.25, size = 2.5,position=position_dodge(width=0.9))
## Warning: Removed 1 rows containing missing values (geom_bar).
## Warning: Removed 1 rows containing missing values (geom_text).

Correlation plot of items by context
Italian in Australia
> cov <- cor(filtered_conv[filtered_conv$Context == "Italian in Australia",likert_variables1[!(likert_variables1 %in% "necessity1")]],method = "pearson",use="pairwise.complete.obs")
>
>
> row_infos <- data.frame(Variables=sapply(strsplit(colnames(cov),split="\\."),function(x) x[2]))
> row_infos$Variables <- as.character(row_infos$Variables)
> rownames(row_infos) <- rownames(cov)
> row_infos$Variables[which(is.na(row_infos$Variables))] <- c("educated")
> row_infos <- row_infos[order(row_infos$Variables),,drop=FALSE]
>
> ann_col_wide <- data.frame(Variable=unique(row_infos$Variables))
> ann_colors_wide <- list(Variables=c(comm1="#bd0026",educated="#b35806", id1="#f6e8c3",instru1="#35978f",integr1="#386cb0",intr1="#ffff99",ought1="grey",post1="black",prof1="pink"))
>
> #pheatmap(cov, main = "Italian in Australia",annotation_names_row = FALSE,cluster_cols=TRUE,cluster_rows=TRUE,annotation_col = row_infos[,1,drop=FALSE], annotation_row = row_infos[,1,drop=FALSE], annotation_colors = ann_colors_wide,breaks=seq(-1,1,0.2),col=c("#67001f","#b2182b","#d6604d","#f4a582","#fddbc7","#f7f7f7","#d1e5f0","#92c5de","#4393c3","#2166ac","#053061"),show_colnames = FALSE,width = 7,height = 7)
> ###################
>
> diag(cov) <- NA
> pheatmap(cov, main = "Italian in Australia",annotation_names_row = FALSE,cluster_cols=TRUE,cluster_rows=TRUE,annotation_col = row_infos[,1,drop=FALSE], annotation_row = row_infos[,1,drop=FALSE]
+ , annotation_colors = ann_colors_wide,show_colnames = FALSE,breaks = seq(-0.6,0.7,length.out = 50),width = 7,height = 7,color=colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(50))

German in Australia
> cov <- cor(filtered_conv[filtered_conv$Context == "German in Australia",likert_variables1[!(likert_variables1 %in% "necessity1")]],method = "pearson",use="pairwise.complete.obs")
>
> row_infos <- data.frame(Variables=sapply(strsplit(colnames(cov),split="\\."),function(x) x[2]))
> row_infos$Variables <- as.character(row_infos$Variables)
> rownames(row_infos) <- rownames(cov)
> row_infos$Variables[which(is.na(row_infos$Variables))] <- c("educated")
> row_infos <- row_infos[order(row_infos$Variables),,drop=FALSE]
>
> ann_col_wide <- data.frame(Variable=unique(row_infos$Variables))
> ann_colors_wide <- list(Variables=c(comm1="#bd0026",educated="#b35806", id1="#f6e8c3",instru1="#35978f",integr1="#386cb0",intr1="#ffff99",ought1="grey",post1="black",prof1="pink"))
>
> diag(cov) <- NA
> pheatmap(cov, main = "German in Australia",annotation_names_row = FALSE,cluster_cols=TRUE,cluster_rows=TRUE,annotation_col = row_infos[,1,drop=FALSE], annotation_row = row_infos[,1,drop=FALSE]
+ , annotation_colors = ann_colors_wide,show_colnames = FALSE,breaks = seq(-0.6,0.7,length.out = 50),width = 7,height = 7,color=colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(50))

English in Germany
> cov <- cor(filtered_conv[filtered_conv$Context == "English in Germany",likert_variables1[!(likert_variables1 %in% c("reconnect.comm1", "speakersmelb.comm1","comecloser.comm1","educated1"))]],method = "pearson",use="pairwise.complete.obs")
>
> row_infos <- data.frame(Variables=sapply(strsplit(colnames(cov),split="\\."),function(x) x[2]))
> row_infos$Variables <- as.character(row_infos$Variables)
> rownames(row_infos) <- rownames(cov)
> row_infos$Variables[which(is.na(row_infos$Variables))] <- c("necessity")
> row_infos <- row_infos[order(row_infos$Variables),,drop=FALSE]
>
> ann_col_wide <- data.frame(Variable=unique(row_infos$Variables))
> ann_colors_wide <- list(Variables=c(id1="#f6e8c3",necessity="#b35806",instru1="#35978f",integr1="#386cb0",intr1="#ffff99",ought1="grey",post1="black",prof1="pink"))
>
> diag(cov) <- NA
> pheatmap(cov, main = "English in Germany",annotation_names_row = FALSE,cluster_cols=TRUE,cluster_rows=TRUE,annotation_col = row_infos[,1,drop=FALSE], annotation_row = row_infos[,1,drop=FALSE]
+ , annotation_colors = ann_colors_wide,show_colnames = FALSE,breaks = seq(-0.6,0.7,length.out = 50),width = 7,height = 7,color=colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(50))

English in Italy
> # Pairwise Pearson correlations between the Likert items for this context.
> # The three community items and educated1 are left out of the matrix.
> cov <- cor(
+   filtered_conv[
+     filtered_conv$Context == "English in Italy",
+     likert_variables1[!(likert_variables1 %in% c("reconnect.comm1", "speakersmelb.comm1", "comecloser.comm1", "educated1"))]
+   ],
+   method = "pearson",
+   use = "pairwise.complete.obs"
+ )
>
> # Item group = second dot-separated piece of each column name.
> # vapply() guarantees a character vector regardless of the input.
> row_infos <- data.frame(
+   Variables = vapply(strsplit(colnames(cov), split = "\\."), function(x) x[2], character(1)),
+   stringsAsFactors = FALSE
+ )
> rownames(row_infos) <- rownames(cov)
> # Column names without a "." have no second piece (NA); label them explicitly.
> row_infos$Variables[is.na(row_infos$Variables)] <- "necessity"
> row_infos <- row_infos[order(row_infos$Variables), , drop = FALSE]
>
> # NOTE(review): ann_col_wide is not passed to the pheatmap() call below.
> ann_col_wide <- data.frame(Variable = unique(row_infos$Variables))
> # Colour used for each item group in the row/column annotation bars.
> ann_colors_wide <- list(
+   Variables = c(
+     comm1 = "#bd0026", necessity = "#b35806", id1 = "#f6e8c3",
+     instru1 = "#35978f", integr1 = "#386cb0", intr1 = "#ffff99",
+     ought1 = "grey", post1 = "black", prof1 = "pink"
+   )
+ )
>
> # The diagonal of a correlation matrix is all 1s; blank it out before plotting.
> diag(cov) <- NA
> # pheatmap expects `breaks` to be one element longer than `color`,
> # hence 51 break points for the 50-colour palette.
> pheatmap(
+   cov,
+   main = "English in Italy",
+   annotation_names_row = FALSE,
+   cluster_cols = TRUE,
+   cluster_rows = TRUE,
+   annotation_col = row_infos[, 1, drop = FALSE],
+   annotation_row = row_infos[, 1, drop = FALSE],
+   annotation_colors = ann_colors_wide,
+   show_colnames = FALSE,
+   breaks = seq(-0.6, 0.7, length.out = 51),
+   width = 7,
+   height = 7,
+   color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(50)
+ )

All contexts together
> # Pairwise Pearson correlations between all Likert items, pooling every context.
> cov <- cor(
+   filtered_conv[, likert_variables1],
+   method = "pearson",
+   use = "pairwise.complete.obs"
+ )
>
> # Item group = second dot-separated piece of each column name.
> # vapply() guarantees a character vector regardless of the input.
> row_infos <- data.frame(
+   Variables = vapply(strsplit(colnames(cov), split = "\\."), function(x) x[2], character(1)),
+   stringsAsFactors = FALSE
+ )
> rownames(row_infos) <- rownames(cov)
> # Column names without a "." have no second piece (NA). The two labels are
> # assigned positionally, in column order.
> # NOTE(review): this assumes exactly two such columns, with the "necessity"
> # item before the "educated" item in likert_variables1 - confirm.
> row_infos$Variables[is.na(row_infos$Variables)] <- c("necessity", "educated")
> row_infos <- row_infos[order(row_infos$Variables), , drop = FALSE]
>
> # NOTE(review): ann_col_wide is not passed to the pheatmap() call below.
> ann_col_wide <- data.frame(Variable = unique(row_infos$Variables))
> # Colour used for each item group in the row/column annotation bars.
> ann_colors_wide <- list(
+   Variables = c(
+     comm1 = "#bd0026", educated = "orange", id1 = "#f6e8c3",
+     instru1 = "#35978f", necessity = "#b35806", integr1 = "#386cb0",
+     intr1 = "#ffff99", ought1 = "grey", post1 = "black", prof1 = "pink"
+   )
+ )
>
> # The diagonal of a correlation matrix is all 1s; blank it out before plotting.
> diag(cov) <- NA
> # pheatmap expects `breaks` to be one element longer than `color`,
> # hence 51 break points for the 50-colour palette.
> pheatmap(
+   cov,
+   main = "All Contexts",
+   annotation_names_row = FALSE,
+   cluster_cols = TRUE,
+   cluster_rows = TRUE,
+   annotation_col = row_infos[, 1, drop = FALSE],
+   annotation_row = row_infos[, 1, drop = FALSE],
+   annotation_colors = ann_colors_wide,
+   show_colnames = FALSE,
+   breaks = seq(-0.6, 0.7, length.out = 51),
+   width = 7,
+   height = 7,
+   color = colorRampPalette(brewer.pal(n = 7, name = "RdBu"))(50)
+ )

Evaluate internal consistency of known constructs with Cronbach's alpha
> # Named list of item sets: for each construct, the Likert column names whose
> # name ends in that construct's suffix (".id1", ".ought1", ...).
> sets <- lapply(
+   c(
+     id.var = "\\.id1$",
+     ought.var = "\\.ought1$",
+     intr.var = "\\.intr1$",
+     instru.var = "\\.instru1$",
+     integr1.var = "\\.integr1$",
+     prof.var = "\\.prof1$",
+     post.var = "\\.post1$",
+     comm.var = "\\.comm1$"
+   ),
+   function(pattern) grep(pattern, likert_variables1, value = TRUE)
+ )
>
>
> # Internal-consistency summary for one set of items.
> #
> # dataMot: data frame holding the item columns.
> # var:     character vector of column names forming one scale
> #          (defaults to sets$id.var).
> #
> # Returns a data frame with one row per item: the whole-scale statistics
> # (`alpha.*` columns, repeated on every row) side by side with the statistics
> # obtained when that item is dropped (`drop.*` columns).
> get_alpha <- function(dataMot,
+                       var = sets$id.var) {
+   # Call psych's alpha() explicitly: ggplot2 also exports an alpha(), so the
+   # unqualified name depends on which package was attached last.
+   var_alpha <- psych::alpha(dataMot[, var])
+   dataf <- data.frame(alpha = var_alpha$total,
+                       drop = var_alpha$alpha.drop)
+   rownames(dataf) <- rownames(var_alpha$alpha.drop)
+   dataf
+ }
>
> # "Italian in Australia"
> # One get_alpha() table per item set, stacked into a single data frame.
> # The argument is spelled out as `dataMot`; `data=` only worked through
> # partial argument matching.
> ita_in_au <- do.call(rbind, lapply(sets, function(items) {
+   get_alpha(dataMot = filtered_conv[filtered_conv$Context == "Italian in Australia", ],
+             var = items)
+ }))
> # Split the stacked row names on "."; piece 1 is stored as the construct
> # label and piece 3 as the item label.
> ita_in_au$var <- vapply(strsplit(rownames(ita_in_au), split = "\\."), function(x) x[1], character(1))
> ita_in_au$var.full <- vapply(strsplit(rownames(ita_in_au), split = "\\."), function(x) x[3], character(1))
> ita_in_au$Context <- "Italian in Australia"
> rownames(ita_in_au) <- NULL
>
> # "German in Australia"
> # One get_alpha() table per item set, stacked into a single data frame.
> # The argument is spelled out as `dataMot`; `data=` only worked through
> # partial argument matching.
> germ_in_au <- do.call(rbind, lapply(sets, function(items) {
+   get_alpha(dataMot = filtered_conv[filtered_conv$Context == "German in Australia", ],
+             var = items)
+ }))
## Warning in alpha(dataMot[, var]): Some items were negatively correlated with the total scale and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( knowledge.instru1 ) were negatively correlated with the total scale and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
> # Split the stacked row names on "."; piece 1 is stored as the construct
> # label and piece 3 as the item label. vapply() keeps both columns character.
> germ_in_au$var <- vapply(strsplit(rownames(germ_in_au), split = "\\."), function(x) x[1], character(1))
> germ_in_au$var.full <- vapply(strsplit(rownames(germ_in_au), split = "\\."), function(x) x[3], character(1))
> germ_in_au$Context <- "German in Australia"
> rownames(germ_in_au) <- NULL
>
> # "English in Germany"
> # One get_alpha() table per item set (comm.var is skipped for this context),
> # stacked into a single data frame. The argument is spelled out as `dataMot`;
> # `data=` only worked through partial argument matching.
> eng_in_germ <- do.call(rbind, lapply(sets[!(names(sets) %in% "comm.var")], function(items) {
+   get_alpha(dataMot = filtered_conv[filtered_conv$Context == "English in Germany", ],
+             var = items)
+ }))
## Warning in alpha(dataMot[, var]): Some items were negatively correlated with the total scale and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( people.ought1 ) were negatively correlated with the total scale and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
> # The item set that triggers the warning above, inspected on its own.
> get_alpha(dataMot = filtered_conv[filtered_conv$Context == "English in Germany", ],
+           var = sets$ought.var)
## Warning in alpha(dataMot[, var]): Some items were negatively correlated with the total scale and probably
## should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## Some items ( people.ought1 ) were negatively correlated with the total scale and
## probably should be reversed.
## To do this, run the function again with the 'check.keys=TRUE' option
## alpha.raw_alpha alpha.std.alpha alpha.G6.smc.
## consider.ought1 0.3833548 0.4871755 0.5210789
## people.ought1 0.3833548 0.4871755 0.5210789
## expect.ought1 0.3833548 0.4871755 0.5210789
## fail.ought1 0.3833548 0.4871755 0.5210789
## alpha.average_r alpha.S.N alpha.ase alpha.mean alpha.sd
## consider.ought1 0.1919167 0.9499849 0.1246586 2.192857 0.5301959
## people.ought1 0.1919167 0.9499849 0.1246586 2.192857 0.5301959
## expect.ought1 0.1919167 0.9499849 0.1246586 2.192857 0.5301959
## fail.ought1 0.1919167 0.9499849 0.1246586 2.192857 0.5301959
## drop.raw_alpha drop.std.alpha drop.G6.smc. drop.average_r
## consider.ought1 0.10658734 0.2603802 0.3349500 0.10502423
## people.ought1 0.66770987 0.6922109 0.6147672 0.42846015
## expect.ought1 0.07902542 0.1229898 0.1717773 0.04465828
## fail.ought1 0.32828494 0.4122936 0.3966550 0.18952428
## drop.S.N drop.alpha.se
## consider.ought1 0.3520461 0.19135896
## people.ought1 2.2489778 0.06738635
## expect.ought1 0.1402376 0.19008086
## fail.ought1 0.7015298 0.14252260
> # Split the stacked row names on "."; piece 1 is stored as the construct
> # label and piece 3 as the item label. vapply() keeps both columns character.
> eng_in_germ$var <- vapply(strsplit(rownames(eng_in_germ), split = "\\."), function(x) x[1], character(1))
> eng_in_germ$var.full <- vapply(strsplit(rownames(eng_in_germ), split = "\\."), function(x) x[3], character(1))
> eng_in_germ$Context <- "English in Germany"
> rownames(eng_in_germ) <- NULL
>
> # "English in Italy"
> # One get_alpha() table per item set (comm.var is skipped for this context),
> # stacked into a single data frame. The argument is spelled out as `dataMot`;
> # `data=` only worked through partial argument matching.
> eng_in_ita <- do.call(rbind, lapply(sets[!(names(sets) %in% "comm.var")], function(items) {
+   get_alpha(dataMot = filtered_conv[filtered_conv$Context == "English in Italy", ],
+             var = items)
+ }))
> # Split the stacked row names on "."; piece 1 is stored as the construct
> # label and piece 3 as the item label.
> eng_in_ita$var <- vapply(strsplit(rownames(eng_in_ita), split = "\\."), function(x) x[1], character(1))
> eng_in_ita$var.full <- vapply(strsplit(rownames(eng_in_ita), split = "\\."), function(x) x[3], character(1))
> eng_in_ita$Context <- "English in Italy"
> rownames(eng_in_ita) <- NULL
>
>
> # Stack the four per-context alpha tables into one data frame.
> full_alpha <- do.call(rbind, list(eng_in_ita, eng_in_germ, germ_in_au, ita_in_au))
> # One point per context and construct: the standardised alpha of the whole
> # scale (the value is repeated on every item row, hence unique()).
> full_alpha %>%
+   group_by(Context, var) %>%
+   summarise(
+     st.alpha = unique(alpha.std.alpha),
+     G6 = unique(alpha.G6.smc.)
+   ) %>%
+   ggplot(aes(x = var, y = st.alpha, colour = Context)) +
+   geom_point() +
+   geom_line(aes(group = Context)) +
+   theme_bw()

> # Split each variable name at "." into the item name and the item type,
> # keeping the original `variable` column alongside the two new ones.
> all_melt <- separate(
+   all_melt,
+   variable,
+   into = c("item", "type"),
+   sep = "\\.",
+   remove = FALSE
+ )
## Warning: Expected 2 pieces. Missing pieces filled with `NA` in 646 rows
## [9368, 9369, 9370, 9371, 9372, 9373, 9374, 9375, 9376, 9377, 9378, 9379,
## 9380, 9381, 9382, 9383, 9384, 9385, 9386, 9387, ...].
> # Response counts per item, by context (rows) and item type (columns).
> # theme_bw() is a complete theme and replaces every earlier theme() setting,
> # so it has to come BEFORE the axis-text tweaks, not after them.
> p1 <- ggplot(all_melt, aes(x = variable, fill = value)) +
+   geom_bar(position = "stack") +
+   facet_grid(Context ~ type, scales = "free") +
+   ggtitle("Filtered dataset") +
+   theme_bw() +
+   theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text = element_text(size = 8))
>
> # Standardised alpha when each item is dropped, one panel per construct.
> p2 <- ggplot(full_alpha, aes(x = var.full, y = drop.std.alpha, colour = Context)) +
+   geom_point() +
+   geom_line(aes(group = Context)) +
+   theme_bw() +
+   facet_wrap(~var, scales = "free")
>
> # Average inter-item correlation when each item is dropped, one panel per construct.
> p4 <- ggplot(full_alpha, aes(x = var.full, y = drop.average_r, colour = Context)) +
+   geom_point() +
+   geom_line(aes(group = Context)) +
+   theme_bw() +
+   facet_wrap(~var, scales = "free")
>
> # Whole-scale standardised alpha per construct and context
> # (same theme ordering fix as p1).
> p3 <- full_alpha %>%
+   group_by(Context, var) %>%
+   summarise(st.alpha = unique(alpha.std.alpha),
+             G6 = unique(alpha.G6.smc.)) %>%
+   ggplot(., aes(x = var, y = st.alpha, colour = Context)) +
+   geom_point() +
+   geom_line(aes(group = Context)) +
+   theme_bw() +
+   theme(axis.text.x = element_text(angle = 45, hjust = 1), axis.text = element_text(size = 8))
>
>
> # Only p2 and p3 are drawn here; p1 and p4 are built but not shown in this grid.
> cowplot::plot_grid(p2, p3, nrow = 2)
